Alison’s EDA

Alison’s EDA before collaboration

Alison Yao
11/22/2021

Load Dataset

First, read the 3 csv files downloaded from 2019 OECD study of violence against women.

attitude_df = read.csv('./Data/attitude.csv')
law_df = read.csv('./Data/laws.csv')
prevalence_df = read.csv('./Data/prevalence.csv')

Initial Findings

By just eyeballing the data, I can see that: 1. Not all countries are listed and not all countries have record on all three features. 2. The range of three features are different (0-100% or 0-1). Strange thing is that law only has 4 values: 0.25, 0.5, 0.75 and 1. It looks more categorical than continuous. 3. The countries are abbreviated…so I need some way to match the code with the names

# Data Source: https://www.iban.com/country-codes
# country_code_df = read.csv('./Data/country_code.csv')
# check_code <- function(df){
#    for (code in df$LOCATION){
#     if (code %in% country_code_df$Abb3) {
#     } else {
#       print(code, 'not found!')
#     }
#   }
# }
# check_code(attitude_df)
# check_code(law_df)
# check_code(prevalence_df)
# # There should not be any output - Awesome!

Wait a second, I recall that we used similar data in Lab06. And it also provides continent data. This is even better.

covid_data <- read_csv("https://wzb-ipi.github.io/corona/df_full.csv")
covid_subset <- covid_data[c('geoid2', 'country', 'continent', 'gdp_pc')]
covid_subset <- covid_subset[!duplicated(covid_subset), ]
covid_subset
# A tibble: 218 × 4
   geoid2 country              continent gdp_pc
   <chr>  <chr>                <chr>      <dbl>
 1 ABW    Aruba                America    NA   
 2 AFG    Afghanistan          Asia        2.19
 3 AGO    Angola               Africa      6.93
 4 AIA    Anguilla             America    NA   
 5 ALB    Albania              Europe     13.6 
 6 AND    Andorra              Europe     NA   
 7 ARE    United Arab Emirates Asia       67.0 
 8 ARG    Argentina            America    22.7 
 9 ARM    Armenia              Europe     12.7 
10 ASM    American Samoa       Oceania    NA   
# … with 208 more rows
# check if there are duplicated countries by running
# dplyr::count(covid_subset, geoid2, sort = TRUE)
check_code <- function(df){
   for (code in df$LOCATION){
    if (code %in% covid_subset$geoid2) {
    } else {
      print(paste0("Not Found: ", code))
    }
  }
}
check_code(attitude_df)
[1] "Not Found: TKM"
[1] "Not Found: HKG"
check_code(law_df)
[1] "Not Found: HKG"
[1] "Not Found: TKM"
check_code(prevalence_df)
[1] "Not Found: HKG"
# There should not be any output - but there are two

Let’s add them manually.

covid_subset <- covid_subset %>% add_row(geoid2 = "HKG", 
                         country = "Hong Kong",
                         continent = "Asia", 
                         gdp_pc = NA)
covid_subset <- covid_subset %>% add_row(geoid2 = "TKM", 
                         country = "Turkmenistan",
                         continent = "Asia", 
                         gdp_pc = NA)
colnames(covid_subset) <- c('Country', 'CountryName', 'Continent', 'GDP_per_capita')
covid_subset
# A tibble: 220 × 4
   Country CountryName          Continent GDP_per_capita
   <chr>   <chr>                <chr>              <dbl>
 1 ABW     Aruba                America            NA   
 2 AFG     Afghanistan          Asia                2.19
 3 AGO     Angola               Africa              6.93
 4 AIA     Anguilla             America            NA   
 5 ALB     Albania              Europe             13.6 
 6 AND     Andorra              Europe             NA   
 7 ARE     United Arab Emirates Asia               67.0 
 8 ARG     Argentina            America            22.7 
 9 ARM     Armenia              Europe             12.7 
10 ASM     American Samoa       Oceania            NA   
# … with 210 more rows

Now we can analyze patterns across continents if we want to.

Join Data

I will put everything in a dataframe by full outer joining three subsets of the dataframes.

colnames(attitude_df) <- c('LOCATION', 'INDICATOR', 'SUBJECT', 'MEASURE', 'FREQUENCY', 'TIME', 'Value', 'Flag.Codes')
attitude_sub <- attitude_df[c('LOCATION', 'Value')]
colnames(attitude_sub) <- c('Country', 'Attitude')

colnames(prevalence_df) <- c('LOCATION', 'INDICATOR', 'SUBJECT', 'MEASURE', 'FREQUENCY', 'TIME', 'Value', 'Flag.Codes')
prevalence_sub <- prevalence_df[c('LOCATION', 'Value')]

colnames(law_df) <- c('LOCATION', 'INDICATOR', 'SUBJECT', 'MEASURE', 'FREQUENCY', 'TIME', 'Value', 'Flag.Codes')
law_sub <- law_df[c('LOCATION', 'Value')]
colnames(law_sub) <- c('Country', 'Law')

colnames(prevalence_sub) <- c('Country', 'Prevalence')
df <- full_join(attitude_sub, prevalence_sub, by = "Country")
df <- full_join(df, law_sub, by = "Country")
df <- left_join(df, covid_subset, by = 'Country')
df
    Country Attitude Prevalence  Law              CountryName
1       AUS      3.2       16.9 0.75                Australia
2       CAN      7.8        1.9 0.25                   Canada
3       FIN     11.2       30.0 0.75                  Finland
4       FRA      6.6       26.0 0.25                   France
5       DEU     19.6       22.0 0.75                  Germany
6       HUN      8.7       21.0 0.75                  Hungary
7       ITA      5.3       19.0 0.75                    Italy
8       JPN      8.9       15.4 0.75                    Japan
9       KOR     18.4       16.5 0.25              South Korea
10      MEX      5.0       14.1 0.50                   Mexico
11      NLD      6.4       25.0 0.75              Netherlands
12      NZL      7.7       35.0 0.75              New Zealand
13      NOR      9.9       27.0 0.25                   Norway
14      POL      7.9       13.0 0.75                   Poland
15      ESP      9.6       13.0 0.50                    Spain
16      SWE     10.2       28.0 0.25                   Sweden
17      CHE     15.2        9.8 0.75              Switzerland
18      TUR     13.3       38.0 0.25                   Turkey
19      GBR     10.2       29.0 0.75           United Kingdom
20      USA     11.0       35.6 0.50            United States
21      ALB     29.8       24.6 0.50                  Albania
22      DZA     48.2         NA 0.50                  Algeria
23      ARG     11.6         NA 0.75                Argentina
24      ARM     10.1        8.2 0.75                  Armenia
25      AZE     28.0       13.5 0.75               Azerbaijan
26      BGD     28.3       53.3 0.75               Bangladesh
27      BLR      4.1       25.0 0.75                  Belarus
28      BIH      4.8       13.1 0.50     Bosnia & Herzegovina
29      BRA      8.5       33.5 0.25                   Brazil
30      BGR     18.2       23.0 0.75                 Bulgaria
31      KHM     50.4       20.9 0.50                 Cambodia
32      CHL     10.3        6.7 0.75                    Chile
33      CHN     32.7         NA 0.75                    China
34      COL     11.1       37.4 0.25                 Colombia
35      EGY     35.7         NA 0.75                    Egypt
36      EST     16.9       20.0 0.25                  Estonia
37      ETH     63.0       28.0 0.50                 Ethiopia
38      GEO      8.6        6.1 0.75                  Georgia
39      GHA     28.3       24.4 0.75                    Ghana
40      HTI     58.9       20.8 0.75                    Haiti
41      IND     22.1       28.7 0.50                    India
42      IDN     34.5       18.3 0.75                Indonesia
43      IRN     21.0       66.0 0.75                     Iran
44      KAZ     14.2       16.5 0.75               Kazakhstan
45      MKD     14.5       27.7 0.75          North Macedonia
46      MYS     41.5         NA 0.75                 Malaysia
47      MDA     11.2       45.5 0.25                  Moldova
48      MOZ     22.9       21.7 0.75               Mozambique
49      NGA     34.7       16.2 0.75                  Nigeria
50      PAK     42.2       85.0 0.50                 Pakistan
51      PER     32.2       33.2 0.75                     Peru
52      PHL     12.9       16.9 0.75              Philippines
53      ROU      7.5       24.0 0.25                  Romania
54      RUS     23.3       19.6 1.00                   Russia
55      SGP     40.5        6.1 0.50                Singapore
56      SVN     15.8       13.0 0.25                 Slovenia
57      ZAF     61.2       20.6 0.50             South Africa
58      SDN     34.0         NA 0.75                    Sudan
59      TZA     58.0       41.7 0.75                 Tanzania
60      THA      8.6       44.2 0.75                 Thailand
61      UKR      2.9       13.2 0.75                  Ukraine
62      URY      1.5       14.8 0.75                  Uruguay
63      VNM     28.2       34.4 0.75                  Vietnam
64      ZMB     46.9       42.7 0.50                   Zambia
65      SRB      3.8       23.7 0.25                   Serbia
66      AFG     80.2       60.8 0.75              Afghanistan
67      BEN     36.0       68.6 0.50                    Benin
68      BTN     68.4       26.5 0.50                   Bhutan
69      BOL     16.1       64.1 0.25                  Bolivia
70      BFA     43.5       11.5 0.75             Burkina Faso
71      BDI     72.9       46.7 0.50                  Burundi
72      CMR     36.1       51.1 0.75                 Cameroon
73      CAF     79.6       29.8 0.25 Central African Republic
74      TCD     73.5       28.6 0.75                     Chad
75      COG     54.2         NA 0.75      Congo - Brazzaville
76      COD     74.8       50.7 0.75         Congo - Kinshasa
77      DOM      2.0       20.4 0.25       Dominican Republic
78      ECU     25.2       37.5 0.50                  Ecuador
79      GNQ     52.6       56.9 1.00        Equatorial Guinea
80      ERI     51.4         NA 0.75                  Eritrea
81      GAB     50.2       48.6 0.75                    Gabon
82      GMB     58.4       20.1 0.50                   Gambia
83      GTM     11.0       18.0 0.75                Guatemala
84      GIN     92.1       80.0 0.75                   Guinea
85      GNB     41.8         NA 0.75            Guinea-Bissau
86      HND     12.4       21.6 0.25                 Honduras
87      IRQ     54.8       21.2 0.75                     Iraq
88      JAM      4.9       19.7 0.75                  Jamaica
89      JOR     18.0       23.6 0.75                   Jordan
90      KEN     41.8       39.4 0.50                    Kenya
91      KWT     37.0         NA 0.75                   Kuwait
92      KGZ     32.8       25.4 0.75               Kyrgyzstan
93      LAO     58.2       15.3 0.25                     Laos
94      LBN     43.6       10.4 0.75                  Lebanon
95      LSO     37.1       62.0 0.75                  Lesotho
96      LBR     42.5       38.5 0.75                  Liberia
97      LBY     25.1         NA 0.75                    Libya
98      MDG     45.2       30.0 0.50               Madagascar
99      MWI     16.3       37.5 0.50                   Malawi
100     MLI     72.6       34.6 0.75                     Mali
101     MNG     10.1       31.2 0.25                 Mongolia
102     MAR     22.0       30.0 0.75                  Morocco
103     NAM     28.2       25.0 0.25                  Namibia
104     NPL     42.9       25.0 0.25                    Nepal
105     NIC     13.7       22.5 0.25                Nicaragua
106     NER     59.6         NA 0.75                    Niger
107     PSE     38.7         NA 0.75  Palestinian Territories
108     QAT      6.6         NA 0.75                    Qatar
109     RWA     41.4       34.4 0.50                   Rwanda
110     SEN     56.5       78.0 0.50                  Senegal
111     SLE     62.8       45.3 0.50             Sierra Leone
112     SOM     75.7         NA 0.50                  Somalia
113     LKA     53.2       16.6 0.50                Sri Lanka
114     SWZ     19.9         NA 0.75                 Eswatini
115     TJK     59.6       20.3 0.75               Tajikistan
116     TLS     86.2       58.8 0.50              Timor-Leste
117     TGO     28.7       22.1 0.75                     Togo
118     TTO      9.9       30.2 0.75        Trinidad & Tobago
119     TUN     18.6       20.3 0.50                  Tunisia
120     TKM     26.3         NA 0.75             Turkmenistan
121     UGA     58.3       49.9 0.75                   Uganda
122     UZB     41.5         NA 0.75               Uzbekistan
123     YEM     48.7       67.0 0.75                    Yemen
124     ZWE     38.7       35.4 0.50                 Zimbabwe
125     AUT      3.0       13.0 0.25                  Austria
126     BEL      2.0       24.0 0.50                  Belgium
127     CZE      2.0       21.0 0.75                  Czechia
128     DNK      0.0       32.0 0.50                  Denmark
129     GRC      2.0       19.0 0.25                   Greece
130     IRL      1.0       15.0 0.25                  Ireland
131     LUX      2.0       22.0 0.75               Luxembourg
132     PRT      2.0       19.0 0.25                 Portugal
133     SVK      5.0       23.0 0.25                 Slovakia
134     AGO     25.2       34.8 0.75                   Angola
135     CRI      3.5       36.0 0.75               Costa Rica
136     CIV     47.9       25.9 0.75      C\xf4te d\x92Ivoire
137     HRV      4.4       13.0 0.25                  Croatia
138     CUB      3.9         NA 0.75                     Cuba
139     CYP     10.5       15.0 0.25                   Cyprus
140     SLV      7.7       26.3 0.50              El Salvador
141     FJI     42.6       64.1 0.75                     Fiji
142     HKG     26.0        9.4 0.75                Hong Kong
143     LVA      2.0       32.0 0.75                   Latvia
144     LTU      2.0       24.0 0.50                Lithuania
145     MLT      0.0       15.0 0.25                    Malta
146     MRT     26.6         NA 0.75               Mauritania
147     MMR     51.2       33.0 0.75          Myanmar (Burma)
148     OMN      7.9         NA 0.75                     Oman
149     PAN      5.7         NA 0.25                   Panama
150     PRY     22.9       17.9 0.50                 Paraguay
151     TWN     21.6         NA 0.25                   Taiwan
152     MNE      2.7         NA 0.25               Montenegro
153     ISL       NA       22.4 0.25                  Iceland
154     ISR       NA         NA 0.75                   Israel
155     SAU       NA         NA 0.75             Saudi Arabia
156     ARE       NA         NA 0.75     United Arab Emirates
157     BHR       NA         NA 0.75                  Bahrain
158     BWA       NA         NA 0.75                 Botswana
159     MUS       NA         NA 0.50                Mauritius
160     PNG       NA         NA 0.75         Papua New Guinea
161     SYR       NA         NA 0.75                    Syria
162     VEN       NA         NA 0.25                Venezuela
163     BRN       NA         NA 0.75                   Brunei
    Continent GDP_per_capita
1     Oceania     49.5759811
2     America     48.9243992
3      Europe     48.1912325
4      Europe     45.5610018
5      Europe     53.6599893
6      Europe     31.0729044
7      Europe     42.1982432
8        Asia     41.0741040
9        Asia     41.8941056
10    America     19.9922301
11     Europe     56.4549252
12    Oceania     42.6352181
13     Europe     63.3327999
14     Europe     31.7656829
15     Europe     40.3289379
16     Europe     53.1464611
17     Europe     68.4794402
18     Europe     28.2988658
19     Europe     46.3098024
20    America     61.3913730
21     Europe     13.6013034
22     Africa     11.4794984
23    America     22.7459038
24     Europe     12.7149582
25     Europe     14.2096494
26       Asia      4.4414235
27     Europe     18.8852355
28     Europe     14.4196345
29    America     14.5962462
30     Europe     22.1814968
31       Asia      4.1593374
32    America     24.2586498
33       Asia     15.2432478
34    America     14.4555891
35     Africa     11.3663363
36     Europe     35.3080862
37     Africa      2.1035130
38     Europe     14.2570817
39     Africa      5.1944390
40    America      1.7671761
41       Asia      6.4968075
42       Asia     11.3715319
43       Asia             NA
44       Asia     25.5442798
45     Europe     15.9439522
46       Asia     27.5369185
47     Europe     12.3730381
48     Africa      1.2895430
49     Africa      5.1550745
50       Asia      4.7397688
51    America     12.7823763
52       Asia      8.5160944
53     Europe     28.5654642
54     Europe     26.6677254
55       Asia     97.7449546
56     Europe     38.0220498
57     Africa     12.6307475
58     Africa      4.1605979
59     Africa      2.5899756
60       Asia     18.0865077
61     Europe     12.3380028
62    America     21.5908335
63       Asia      7.5863849
64     Africa      3.5215316
65     Europe     17.3551285
66       Asia      2.1902403
67     Africa      3.1607771
68       Asia     11.3454449
69    America      8.6555299
70     Africa      2.1315479
71     Africa      0.7615242
72     Africa      3.6035489
73     Africa      0.9331096
74     Africa      1.5763187
75     Africa      3.4144160
76     Africa      1.0858937
77    America     17.7117655
78    America     11.5617492
79     Africa     20.3598451
80     Africa             NA
81     Africa     14.7433895
82     Africa      2.1442251
83    America      8.4484638
84     Africa      2.4984257
85     Africa      1.9491776
86    America      5.6722533
87       Asia     10.6600926
88    America      9.7379817
89       Asia      9.8539272
90     Africa      4.2038023
91       Asia     50.4785941
92       Asia      5.1331519
93       Asia      7.5928198
94       Asia     15.6120122
95     Africa      2.7485437
96     Africa      1.4970045
97     Africa     15.0179891
98     Africa      1.6131045
99     Africa      1.0425264
100    Africa      2.2831012
101      Asia     11.9155793
102    Africa      7.4376375
103    Africa      9.9319733
104      Asia      3.2527433
105   America      5.6948758
106    Africa      1.1964754
107      Asia      5.6616344
108      Asia     94.5026908
109    Africa      2.0885663
110    Africa      3.3147857
111    Africa      1.6632924
112    Africa             NA
113      Asia     12.8646063
114    Africa      8.6060760
115      Asia      3.2347216
116      Asia      3.0798948
117    Africa      1.5525477
118   America     26.2729084
119    Africa     10.7637856
120      Asia             NA
121    Africa      2.1221098
122      Asia      6.7554810
123      Asia             NA
124    Africa      3.1300295
125    Europe     55.6871893
126    Europe     51.2459962
127    Europe     39.4527679
128    Europe     56.1028219
129    Europe     29.7119219
130    Europe     83.4706240
131    Europe    114.1099725
132    Europe     34.0130711
133    Europe     32.0673594
134    Africa      6.9335093
135   America     19.4266534
136    Africa      5.0289201
137    Europe     27.5575376
138   America             NA
139    Europe     38.8224920
140   America      8.6155762
141   Oceania     13.8080655
142      Asia             NA
143    Europe     29.9420074
144    Europe     35.3899775
145    Europe     43.0640188
146    Africa      5.0424232
147      Asia      5.0291809
148      Asia     28.5938304
149   America     31.0491793
150   America     12.8499410
151      Asia             NA
152    Europe     20.6285619
153    Europe     56.1575139
154      Asia     39.5432202
155      Asia     47.5967279
156      Asia     66.9682699
157      Asia     46.2423150
158    Africa     17.6341423
159    Africa     22.2081897
160   Oceania      4.2359440
161      Asia             NA
162   America             NA
163      Asia     60.3889031

Looks good! But there must be many empty values. Let’s check.

summary(df)
   Country             Attitude       Prevalence         Law       
 Length:163         Min.   : 0.00   Min.   : 1.90   Min.   :0.250  
 Class :character   1st Qu.: 8.60   1st Qu.:18.30   1st Qu.:0.500  
 Mode  :character   Median :22.05   Median :24.60   Median :0.750  
                    Mean   :27.52   Mean   :28.96   Mean   :0.592  
                    3rd Qu.:42.52   3rd Qu.:35.00   3rd Qu.:0.750  
                    Max.   :92.10   Max.   :85.00   Max.   :1.000  
                    NA's   :11      NA's   :34                     
 CountryName         Continent         GDP_per_capita    
 Length:163         Length:163         Min.   :  0.7615  
 Class :character   Class :character   1st Qu.:  5.0289  
 Mode  :character   Mode  :character   Median : 13.6013  
                                       Mean   : 21.4992  
                                       3rd Qu.: 31.7657  
                                       Max.   :114.1100  
                                       NA's   :10        

Indeed, there are 11 empty values in Attitude and 34 in Prevalence.

Plots

Although we already have some amazing interactive visualization on the data source website, we need to do a bit more vis here.

Single Variable

df %>% 
  ggplot(aes(x = Attitude)) +
  geom_histogram(binwidth=2) + 
  labs(
    title = 'Histogram of Attitudes toward violence',
    subtitle = '152 countries included',
    x = 'Attitudes',
    y = 'Frequency'
  )

Positively skewed.

df %>% 
  ggplot(aes(x = Attitude)) +
  facet_wrap(. ~ Continent) +
  geom_histogram(binwidth=2.5) + 
  labs(
    title = 'Histogram of Attitudes toward violence',
    subtitle = 'faceted by 152 countries',
    x = 'Attitude',
    y = 'Frequency'
  )

df %>% 
  ggplot(aes(x = Prevalence)) +
  geom_histogram(binwidth=2) + 
  labs(
    title = 'Histogram of Prevalence of violence in the lifetime',
    subtitle = '129 countries included',
    x = 'Prevalence',
    y = 'Frequency'
  )

Positively skewed.

df %>% 
  ggplot(aes(x = Prevalence)) +
  facet_wrap(. ~ Continent) +
  geom_histogram(binwidth=2) + 
  labs(
    title = 'Histogram of Prevalence of violence in the lifetime',
    subtitle = 'faceted by 129 countries',
    x = 'Prevalence',
    y = 'Frequency'
  )

df %>% 
  ggplot(aes(x = Law)) +
  geom_bar() + 
  scale_x_continuous(breaks = c(0.25, 0.5, 0.75, 1.00)) + 
  geom_text(
      aes(label=after_stat(count)),
      stat = 'count',
      nudge_y = 3,
      va = 'bottom'
  ) +
  labs(
    title = 'Bar Chart of Laws on domestic violence',
    subtitle = '163 countries included',
    x = 'Law',
    y = 'Frequency'
  )

This one is not so informative because there are only 4 values. Are we really treating it as continuous?

df %>% 
  ggplot(aes(x = Law)) +
  facet_wrap(. ~ Continent) +
  geom_bar() + 
  scale_x_continuous(breaks = c(0.25, 0.5, 0.75, 1.00)) + 
  geom_text(
      aes(label=after_stat(count)),
      stat = 'count',
      nudge_y = 3,
      va = 'bottom'
  ) +
  labs(
    title = 'Bar Chart of Laws on domestic violence',
    subtitle = 'faceted by 163 countries',
    x = 'Law',
    y = 'Frequency'
  )

Multi-variable

df %>% 
  ggplot(aes(x = Attitude, 
             y = Prevalence, 
             color = as.factor(Law),
             shape = Continent)) +
  geom_point() + 
  labs(
    title = 'Scatterplot of Prevalence vs Attitude',
    x = 'Attitude',
    y = 'Prevalence',
    color = 'Law'
  ) + 
  scale_colour_viridis_d()

df %>% 
  ggplot(aes(x = Attitude, 
             y = Prevalence, 
             color = Continent,
             size = Law)) +
  geom_point(alpha = 0.8) + 
  labs(
    title = 'Scatterplot of Prevalence vs Attitude',
    x = 'Attitude',
    y = 'Prevalence',
    color = 'Continent'
  ) + 
  scale_colour_viridis_d()

cormat <- round(cor(df[c('Attitude', 'Law', 'Prevalence')] %>% drop_na()), 2)
corrplot(cormat, 
         # method = 'color',
         # order = 'alphabet'
  )

df %>% 
  ggplot(aes(x = Attitude, y = Prevalence)) +
  geom_point() + 
  # stat_smooth(method="lm") +
  labs(
    title = 'Scatterplot of Prevalence vs Attitude',
    x = 'Attitude',
    y = 'Prevalence'
  )

We can see a positive correlation.

# box plots are not as good as bar charts with error bars :(
df %>% 
  ggplot(aes(x = as.factor(Law), y = Prevalence)) +
  geom_boxplot() + 
  labs(
    title = 'Boxplot of Prevalence vs Law',
    x = 'Law',
    y = 'Prevalence'
  )

# summary statistics are not bad
df %>% 
  ggplot(aes(x = as.factor(Law), y = Prevalence)) +
   stat_summary() + 
  labs(
    title = 'Summary Statistics of Prevalence vs Law',
    x = 'Law',
    y = 'Prevalence'
  )

df %>% 
  ggplot(aes(x = Law, y = Prevalence)) +
  geom_point() + 
  # stat_smooth(method="lm") +
  labs(
    title = 'Scatterplot of Prevalence vs Law',
    x = 'Law',
    y = 'Prevalence'
  )

df %>% 
  ggplot(aes(x = as.factor(Law), y = Attitude)) +
  geom_boxplot() + 
  labs(
    title = 'Boxplot of Attitude vs Law',
    x = 'Law',
    y = 'Attitude'
  )

df %>% 
  ggplot(aes(x = as.factor(Law), y = Attitude)) +
   stat_summary() + 
  labs(
    title = 'Summary Statistics of Attitude vs Law',
    x = 'Law',
    y = 'Attitude'
  )

df %>% 
  ggplot(aes(x = Law, y = Attitude)) +
  geom_point() + 
  # stat_smooth(method="lm") +
  labs(
    title = 'Scatterplot of Attitude vs Law',
    x = 'Law',
    y = 'Attitude'
  )

Now, this is definitely categorical.

plot_ly(x=df$Attitude,
        y=df$Prevalence,
        z=df$Law,
        type="scatter3d")

This looks fancy but is pretty useless. I cannot see much from this 3D plot. I can’t see any obvious regression plane. The model is probably not going to have a high R-squared.

Regression (Not really useful now that I think about it)

Since we cannot see much from the plots, let’s do a regression analysis to quantify the relationship.

I think the y should be prevalence, because it seems to be the result of attitude and law. Let’s put the prevalence on the left hand side of the equation.

Now I think they are just three indicators. There is no need to run a regression across them.

# reg1 <- brm(formula = Prevalence ~ Attitude + Law,
#                        data=df, 
#                        refresh = 0,
#                        seed = 123) # stabilize the outcome for reproducibility
# summary(reg1)

Alcohol (Total) - Spurious Association

alcohol = read.csv('./Data/alcohol.csv')
colnames(alcohol) <- c('CountryName', 'Country', 'Alcohol_pc')
df <- left_join(df, alcohol, by = 'Country')
df %>% 
  ggplot(aes(x = Alcohol_pc, y = Prevalence)) +
  geom_point() + 
  stat_smooth(method="lm") + 
  labs(
    title = 'Prevalence vs Alcohol Consumption per capita',
    x = 'Alcohol Consumption per capita',
    y = 'Prevalence'
  )

reg2 <- brm(formula = Prevalence ~ Alcohol_pc,
                       data=df, 
                       refresh = 0,
                       seed = 123) # stabilize the outcome for reproducibility
summary(reg2)
 Family: gaussian 
  Links: mu = identity; sigma = identity 
Formula: Prevalence ~ Alcohol_pc 
   Data: df (Number of observations: 128) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Population-Level Effects: 
           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
Intercept     39.29      2.65    34.01    44.47 1.00     3744
Alcohol_pc    -1.52      0.35    -2.21    -0.83 1.00     3758
           Tail_ESS
Intercept      2797
Alcohol_pc     2639

Family Specific Parameters: 
      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma    15.26      0.94    13.52    17.28 1.00     3812     2794

Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

poverty_headcount_ratio - Potential Confounder

poverty_headcount_ratio  = read.csv('./Data/poverty_cleaned.csv')
df <- left_join(df, poverty_headcount_ratio, by = 'Country')
reg3 <- brm(formula = Prevalence ~ Alcohol_pc + poverty_headcount_ratio,
                       data=df, 
                       refresh = 0,
                       seed = 123) # stabilize the outcome for reproducibility
summary(reg3)
 Family: gaussian 
  Links: mu = identity; sigma = identity 
Formula: Prevalence ~ Alcohol_pc + poverty_headcount_ratio 
   Data: df (Number of observations: 118) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Population-Level Effects: 
                        Estimate Est.Error l-95% CI u-95% CI Rhat
Intercept                  26.59      4.26    18.13    35.10 1.00
Alcohol_pc                 -0.96      0.37    -1.65    -0.22 1.00
poverty_headcount_ratio     0.33      0.09     0.16     0.50 1.00
                        Bulk_ESS Tail_ESS
Intercept                   3212     2655
Alcohol_pc                  3767     3002
poverty_headcount_ratio     3275     3185

Family Specific Parameters: 
      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma    14.33      0.95    12.61    16.38 1.00     3850     2786

Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

The absolute value, or the magnitude, of the coefficient of alcohol_pc has dropped from 1.52 to 0.96 (about 37%), so we can conclude that poverty_headcount_ratio is a confounder.

And there are not too many nan’s for the poverty_headcount_ratio! Awesome! We found one!

summary(df[c('Prevalence','poverty_headcount_ratio')])
   Prevalence    poverty_headcount_ratio
 Min.   : 1.90   Min.   : 0.60          
 1st Qu.:18.30   1st Qu.:15.10          
 Median :24.60   Median :23.50          
 Mean   :28.96   Mean   :27.98          
 3rd Qu.:35.00   3rd Qu.:40.00          
 Max.   :85.00   Max.   :76.80          
 NA's   :34      NA's   :24             

Causal Diagram

vaw <- dagitty("dag{
        VaW -> Law;
        VaW -> Attitude;
        VaW -> Prevalence;
        AlcoholConsumption -> Prevalence;
        PovertyHeadcount -> Prevalence;
        PovertyHeadcount -> AlcoholConsumption;
        Alcohol -> AlcoholConsumption;
        Poverty -> PovertyHeadcount;
}")

ggdag(vaw)

The text is too long… How can I fix it???

vaw_dag <- dagify(Law ~ vaw,
       Attitude ~ vaw,
       Prevalence ~ vaw + ph + ac + Law + Attitude,
       ac ~ a + ph,
       ph ~ p,
       labels = c("vaw" = "VaW", 
                  "Law" = "Law",
                  "Attitude" = "Attitude",
                  "Prevalence" = "Prevalence",
                  "ph" = "Poverty\n Headcount",
                  "p" = "Poverty",
                  "ac" = "Alcohol\n Consumption",
                  "a" = "Alcohol"
                  ),
       latent = "vaw")

ggdag(vaw_dag, text = FALSE, use_labels = "label")

Q: What are adjustment sets???

# find variables that aren't related
impliedConditionalIndependencies(vaw_dag)
Attt _||_ a
Attt _||_ ac
Attt _||_ p
Attt _||_ ph
Law _||_ a
Law _||_ ac
Law _||_ p
Law _||_ ph
Prvl _||_ a | ac, ph
Prvl _||_ p | ph
a _||_ p
a _||_ ph
ac _||_ p | ph
# find the adjustment set
adjustmentSets(vaw_dag, exposure="ac", outcome = "Prevalence")
{ ph }
# ggdag version
ggdag_adjustment_set(vaw_dag, exposure="ac", outcome = "Prevalence")

df %>% 
  ggplot(aes(x = poverty_headcount_ratio, y = Prevalence)) +
  geom_point() + 
  stat_smooth(method="lm") + 
  labs(
    title = 'Prevalence vs poverty_headcount_ratio',
    x = 'poverty_headcount_ratio',
    y = 'Prevalence'
  )